{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bf6b376b",
   "metadata": {},
   "source": [
    "### Comparison of different misexpression mechanisms \n",
    "\n",
    "* Assign each misexpressed gene-sample pair to a mechanism "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4042a155",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path \n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5be62164",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "outdir = wkdir_path.joinpath(\"6_misexp_dissect/combined\")\n",
    "outdir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7657131c",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ac334c13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transcriptional readthrough candidate SVs \n",
    "tx_read_vrnt_gene_smpl_path = wkdir_path.joinpath(\"6_misexp_dissect/tx_readthrough/combine/tx_read_vrnts_gene_smpl.tsv\")\n",
    "tx_read_vrnt_gene_smpl_df = pd.read_csv(tx_read_vrnt_gene_smpl_path, sep=\"\\t\")\n",
    "tx_read_gene_smpl_df = tx_read_vrnt_gene_smpl_df[[\"gene_id\", \"rna_id\"]].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1feaee66",
   "metadata": {},
   "outputs": [],
   "source": [
    "tx_fusion_vrnt_gene_smpl_path = wkdir_path.joinpath(\"6_misexp_dissect/star_fusion/results_stringent_qc/tx_fusion_vrnts_gene_smpl.tsv\")\n",
    "tx_fusion_vrnt_gene_smpl_df = pd.read_csv(tx_fusion_vrnt_gene_smpl_path, sep=\"\\t\")\n",
    "tx_fusion_gene_smpl_df = tx_fusion_vrnt_gene_smpl_df[[\"gene_id\", \"rna_id\"]].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7421fc58",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ENSG00000175077' 'ENSG00000171540']\n"
     ]
    }
   ],
   "source": [
    "# check overlap tx fusion and tx readthrough \n",
    "tx_read_fusion_merge_df = pd.merge(tx_fusion_gene_smpl_df, tx_read_gene_smpl_df, on=[\"gene_id\", \"rna_id\"], how=\"left\", indicator=True)\n",
    "# find two intergenic fusion examples - label these as tx readthrough\n",
    "tx_read_fusion_gene_id = tx_read_fusion_merge_df[tx_read_fusion_merge_df[\"_merge\"] == \"both\"].gene_id.unique() # RTP1 and OTP \n",
    "print(tx_read_fusion_gene_id)  # RTP1 and OTP \n",
    "tx_fusion_gene_smpl_rmv_tx_read_df = tx_read_fusion_merge_df[tx_read_fusion_merge_df['_merge'] == 'left_only'].drop(columns=['_merge'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f3f94d48",
   "metadata": {},
   "outputs": [],
   "source": [
    "mech_vrnts =set(tx_fusion_vrnt_gene_smpl_df.vrnt_id.unique()).union(set(tx_read_vrnt_gene_smpl_df.vrnt_id.unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "480f14ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tx_fusion_gene_smpl_rmv_tx_read_df[\"mechanism\"] = \"Transcript fusion\"\n",
    "tx_read_gene_smpl_df[\"mechanism\"] = \"Transcriptional readthrough\"\n",
    "tx_read_fusion_gene_smpl_df = pd.concat([tx_fusion_gene_smpl_rmv_tx_read_df, tx_read_gene_smpl_df])\n",
    "# check no duplicates \n",
    "tx_read_fusion_gene_smpl_df[[\"gene_id\", \"rna_id\"]].drop_duplicates().shape[0] == tx_read_fusion_gene_smpl_df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f7ce9951",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gene inversion mechanism \n",
    "misexp_gene_inv_df = misexp_vrnt_feat_df[(misexp_vrnt_feat_df.SVTYPE == \"INV\") & \n",
    "                                         (misexp_vrnt_feat_df.gene_id == \"ENSG00000114547\")\n",
    "                                        ]\n",
    "misexp_gene_inv_no_dups_df = misexp_gene_inv_df[[\"gene_id\", \"rna_id\"]].drop_duplicates()\n",
    "misexp_gene_inv_no_dups_df[\"mechanism\"] = \"Gene inversion\"\n",
    "# combine misexpression mechanisms \n",
    "misexp_mech_known_df = pd.concat([tx_read_fusion_gene_smpl_df, misexp_gene_inv_no_dups_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "303a91d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_gene_smpl_all_df = misexp_vrnt_feat_df[[\"gene_id\", \"rna_id\", \"TPM\", \"z-score\"]].drop_duplicates()\n",
    "misexp_gene_smpl_all_mech_df = pd.merge(misexp_gene_smpl_all_df, misexp_mech_known_df, on=[\"gene_id\", \"rna_id\"], how=\"left\").fillna(\"Unknown mechanism\")\n",
    "# count gene-sample pairs for each mechanism \n",
    "misexp_gene_smpl_mech_count_df = misexp_gene_smpl_all_mech_df.groupby([\"mechanism\"], as_index=False)[[\"gene_id\", \"rna_id\"]].count()\n",
    "misexp_gene_smpl_mech_count_df = misexp_gene_smpl_mech_count_df[[\"mechanism\", \"gene_id\"]].rename(columns={\"gene_id\": \"count\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "02e109cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "misexp_gene_smpl_mech_count_path = outdir.joinpath(\"misexp_gene_smpl_mech_count.tsv\")\n",
    "misexp_gene_smpl_mech_count_df.to_csv(misexp_gene_smpl_mech_count_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1783b77e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total misexpressed gene sample pairs: 98\n",
      "Total misexpressed gene-sample pairs with putative mechanism: 41\n",
      "Proportion events with putative mechanism: 41/98\n",
      "Percentage events with putative mechanism: 41.83673469387755\n"
     ]
    }
   ],
   "source": [
    "# proportion of misexpression events with misexpression-associated SV that are unkown \n",
    "total_gene_smpl_pair = misexp_gene_smpl_all_mech_df.shape[0]\n",
    "print(f\"Total misexpressed gene sample pairs: {total_gene_smpl_pair}\")\n",
    "# unkown mechanism \n",
    "total_mech_gene_smpl_pair = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism != \"Unknown mechanism\"].shape[0]\n",
    "print(f\"Total misexpressed gene-sample pairs with putative mechanism: {total_mech_gene_smpl_pair}\")\n",
    "print(f\"Proportion events with putative mechanism: {total_mech_gene_smpl_pair}/{total_gene_smpl_pair}\")\n",
    "print(f\"Percentage events with putative mechanism: {(total_mech_gene_smpl_pair/total_gene_smpl_pair) *100}\")\n",
    "# write to file \n",
    "misexp_level_gene_smpl_mech_path = outdir.joinpath(\"misexp_level_gene_smpl_mech.tsv\")\n",
    "misexp_gene_smpl_all_mech_df.to_csv(misexp_level_gene_smpl_mech_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2ed5e2b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Transcriptional readthrough mean z-score: 33.95421999539768\n",
      "Unknown mechanism mean z-score: 25.887858193512624\n",
      "Transcript fusion mean z-score: 52.04248156837565\n",
      "Gene inversion mean z-score: 15.940637785356975\n"
     ]
    }
   ],
   "source": [
    "### average z-scores across mechanisms \n",
    "unknown_zscores = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism == \"Unknown mechanism\"][\"z-score\"].tolist()\n",
    "tx_read_zscores = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism == \"Transcriptional readthrough\"][\"z-score\"].tolist()\n",
    "tx_fusion_zscores = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism == \"Transcript fusion\"][\"z-score\"].tolist()\n",
    "gene_inversion_zscores = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism == \"Gene inversion\"][\"z-score\"].tolist()\n",
    "# mean expression levels \n",
    "mean_tx_read = sum(tx_read_zscores)/len(tx_read_zscores)\n",
    "print(f\"Transcriptional readthrough mean z-score: {mean_tx_read}\")\n",
    "mean_unknown = sum(unknown_zscores)/len(unknown_zscores)\n",
    "print(f\"Unknown mechanism mean z-score: {mean_unknown}\")\n",
    "mean_tx_fusion = sum(tx_fusion_zscores)/len(tx_fusion_zscores)\n",
    "print(f\"Transcript fusion mean z-score: {mean_tx_fusion}\")\n",
    "mean_gene_inv = sum(gene_inversion_zscores)/len(gene_inversion_zscores)\n",
    "print(f\"Gene inversion mean z-score: {mean_gene_inv}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6af17d9",
   "metadata": {},
   "source": [
    "### Remaining misexpression-associated variants \n",
    "\n",
    "* Examine TAD and CTCF cCRE binding sites "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "19efe785",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to gene-sample pairs without a clear mechanism \n",
    "# this will remove suspected non-causal SVs for gene-sample pairs where we have identified a mechanism \n",
    "unknown_mech_gene_smpl_df = misexp_gene_smpl_all_mech_df[misexp_gene_smpl_all_mech_df.mechanism == \"Unknown mechanism\"]\n",
    "misexp_vrnt_feat_unkown_df = pd.merge(misexp_vrnt_feat_df, \n",
    "                                      unknown_mech_gene_smpl_df, \n",
    "                                      on=[\"gene_id\", \"rna_id\", \"TPM\", \"z-score\"], \n",
    "                                      how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5a51b772",
   "metadata": {},
   "outputs": [],
   "source": [
    "# overlapping TAD boundaries and CTCF binding sites "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f0994a93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vrnt_id</th>\n",
       "      <th>gene_id</th>\n",
       "      <th>gene_msc</th>\n",
       "      <th>SVTYPE</th>\n",
       "      <th>msc</th>\n",
       "      <th>chrom</th>\n",
       "      <th>sv_start</th>\n",
       "      <th>sv_end</th>\n",
       "      <th>gene_start</th>\n",
       "      <th>gene_end</th>\n",
       "      <th>...</th>\n",
       "      <th>oe_lof_upper</th>\n",
       "      <th>approved_target</th>\n",
       "      <th>decipher_gene</th>\n",
       "      <th>omim_gene</th>\n",
       "      <th>egan_id</th>\n",
       "      <th>rna_id</th>\n",
       "      <th>TPM</th>\n",
       "      <th>z-score</th>\n",
       "      <th>genotype</th>\n",
       "      <th>mechanism</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>425231</td>\n",
       "      <td>ENSG00000236714</td>\n",
       "      <td>non_coding_transcript_exon_variant</td>\n",
       "      <td>DUP</td>\n",
       "      <td>coding_sequence_variant</td>\n",
       "      <td>chr5</td>\n",
       "      <td>142642218</td>\n",
       "      <td>142903267</td>\n",
       "      <td>142716228</td>\n",
       "      <td>142761035</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>EGAN00001585971</td>\n",
       "      <td>INT_RNA7959310</td>\n",
       "      <td>22.477649</td>\n",
       "      <td>67.353095</td>\n",
       "      <td>(0, 1)</td>\n",
       "      <td>Unknown mechanism</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  vrnt_id          gene_id                            gene_msc SVTYPE  \\\n",
       "4  425231  ENSG00000236714  non_coding_transcript_exon_variant    DUP   \n",
       "\n",
       "                       msc chrom   sv_start     sv_end  gene_start   gene_end  \\\n",
       "4  coding_sequence_variant  chr5  142642218  142903267   142716228  142761035   \n",
       "\n",
       "   ... oe_lof_upper approved_target  decipher_gene  omim_gene  \\\n",
       "4  ...          NaN               0              0          0   \n",
       "\n",
       "           egan_id          rna_id        TPM    z-score  genotype  \\\n",
       "4  EGAN00001585971  INT_RNA7959310  22.477649  67.353095    (0, 1)   \n",
       "\n",
       "           mechanism  \n",
       "4  Unknown mechanism  \n",
       "\n",
       "[1 rows x 59 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Duplication criteria \n",
    "# overlap TAD boundary and CTCF binding site \n",
    "# overlap Genic enhancer or enhancer \n",
    "# overlap entire gene \n",
    "misexp_vrnt_feat_unkown_df[(misexp_vrnt_feat_unkown_df[\"SVTYPE\"] == \"DUP\") &\n",
    "                           (misexp_vrnt_feat_unkown_df[\"gm12878_shared_intersect_tad_boundary\"] == 1) &\n",
    "                           (misexp_vrnt_feat_unkown_df[\"CTCFonlyCTCFbound_all\"] == 1) & \n",
    "                           (misexp_vrnt_feat_unkown_df[\"position\"] == \"Entire gene\") &\n",
    "                           ((misexp_vrnt_feat_unkown_df[\"EnhG\"] == 1) | \n",
    "                           (misexp_vrnt_feat_unkown_df[\"Enh\"] == 1))\n",
    "                           ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4862224b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vrnt_id</th>\n",
       "      <th>gene_id</th>\n",
       "      <th>gene_msc</th>\n",
       "      <th>SVTYPE</th>\n",
       "      <th>msc</th>\n",
       "      <th>chrom</th>\n",
       "      <th>sv_start</th>\n",
       "      <th>sv_end</th>\n",
       "      <th>gene_start</th>\n",
       "      <th>gene_end</th>\n",
       "      <th>...</th>\n",
       "      <th>oe_lof_upper</th>\n",
       "      <th>approved_target</th>\n",
       "      <th>decipher_gene</th>\n",
       "      <th>omim_gene</th>\n",
       "      <th>egan_id</th>\n",
       "      <th>rna_id</th>\n",
       "      <th>TPM</th>\n",
       "      <th>z-score</th>\n",
       "      <th>genotype</th>\n",
       "      <th>mechanism</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>DEL_chr12_49269781_49291419</td>\n",
       "      <td>ENSG00000258334</td>\n",
       "      <td>downstream_gene_variant</td>\n",
       "      <td>DEL</td>\n",
       "      <td>stop_lost</td>\n",
       "      <td>chr12</td>\n",
       "      <td>49269781</td>\n",
       "      <td>49291419</td>\n",
       "      <td>49292630</td>\n",
       "      <td>49324576</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>EGAN00001240929</td>\n",
       "      <td>INT_RNA7879099</td>\n",
       "      <td>1.606799</td>\n",
       "      <td>54.427192</td>\n",
       "      <td>(0, 1)</td>\n",
       "      <td>Unknown mechanism</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>96700</td>\n",
       "      <td>ENSG00000180785</td>\n",
       "      <td>no_predicted_effect</td>\n",
       "      <td>DEL</td>\n",
       "      <td>transcript_ablation</td>\n",
       "      <td>chr11</td>\n",
       "      <td>4495092</td>\n",
       "      <td>4614586</td>\n",
       "      <td>4643419</td>\n",
       "      <td>4655488</td>\n",
       "      <td>...</td>\n",
       "      <td>1.68</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>EGAN00001361345</td>\n",
       "      <td>INT_RNA7710317</td>\n",
       "      <td>2.233265</td>\n",
       "      <td>45.275653</td>\n",
       "      <td>(0, 1)</td>\n",
       "      <td>Unknown mechanism</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>96700</td>\n",
       "      <td>ENSG00000180785</td>\n",
       "      <td>no_predicted_effect</td>\n",
       "      <td>DEL</td>\n",
       "      <td>transcript_ablation</td>\n",
       "      <td>chr11</td>\n",
       "      <td>4495092</td>\n",
       "      <td>4614586</td>\n",
       "      <td>4643419</td>\n",
       "      <td>4655488</td>\n",
       "      <td>...</td>\n",
       "      <td>1.68</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>EGAN00001584413</td>\n",
       "      <td>INT_RNA7879036</td>\n",
       "      <td>2.224477</td>\n",
       "      <td>45.097190</td>\n",
       "      <td>(0, 1)</td>\n",
       "      <td>Unknown mechanism</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        vrnt_id          gene_id                 gene_msc  \\\n",
       "7   DEL_chr12_49269781_49291419  ENSG00000258334  downstream_gene_variant   \n",
       "65                        96700  ENSG00000180785      no_predicted_effect   \n",
       "73                        96700  ENSG00000180785      no_predicted_effect   \n",
       "\n",
       "   SVTYPE                  msc  chrom  sv_start    sv_end  gene_start  \\\n",
       "7     DEL            stop_lost  chr12  49269781  49291419    49292630   \n",
       "65    DEL  transcript_ablation  chr11   4495092   4614586     4643419   \n",
       "73    DEL  transcript_ablation  chr11   4495092   4614586     4643419   \n",
       "\n",
       "    gene_end  ... oe_lof_upper approved_target  decipher_gene  omim_gene  \\\n",
       "7   49324576  ...          NaN               0              0          0   \n",
       "65   4655488  ...         1.68               0              0          1   \n",
       "73   4655488  ...         1.68               0              0          1   \n",
       "\n",
       "            egan_id          rna_id       TPM    z-score  genotype  \\\n",
       "7   EGAN00001240929  INT_RNA7879099  1.606799  54.427192    (0, 1)   \n",
       "65  EGAN00001361345  INT_RNA7710317  2.233265  45.275653    (0, 1)   \n",
       "73  EGAN00001584413  INT_RNA7879036  2.224477  45.097190    (0, 1)   \n",
       "\n",
       "            mechanism  \n",
       "7   Unknown mechanism  \n",
       "65  Unknown mechanism  \n",
       "73  Unknown mechanism  \n",
       "\n",
       "[3 rows x 59 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Deletion criteria, \n",
    "# do not overlap the misexpressed gene \n",
    "# overlap TAD boundary \n",
    "# overlap CTCF-binding site \n",
    "gene_overlap_positions = [\"Partial overlap 5' end\", \"Partial overlap 3' end\", 'Internal', \"Entire gene\"]\n",
    "misexp_vrnt_feat_unkown_df[(misexp_vrnt_feat_unkown_df[\"SVTYPE\"] == \"DEL\") &\n",
    "                           (misexp_vrnt_feat_unkown_df[\"gm12878_shared_intersect_tad_boundary\"] == 1) &\n",
    "                           (misexp_vrnt_feat_unkown_df[\"CTCFonlyCTCFbound_all\"] == 1) &\n",
    "                           ~(misexp_vrnt_feat_unkown_df[\"position\"].isin(gene_overlap_positions))\n",
    "                           ]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
